Part 3 of the analysis. Continued directly from KP_model_data_filtering_part2.ipynb.
np.unique(filtered_assn)
array(['AT1/AT2', 'B cell', 'Basophil', 'Blood Endothelial', 'Ciliated',
'Fibroblast', 'Low Quality', 'Lymphatic Endothelial',
'Mono/Mac/DC', 'Neutrophil', 'Platelet', 'T/NK cell'], dtype=object)
# myeloid lineage, genes with expression in >10 cells
MYELOIDINDICES = filtered_assn.loc[filtered_assn.isin(['Mono/Mac/DC'])].index
GSELECT = (counts_df.loc[MYELOIDINDICES,:] > 0).sum(axis=0) > 10
myeloid_count = counts_df.loc[MYELOIDINDICES,GSELECT]
myeloid_normlog = normlog_df.loc[MYELOIDINDICES,GSELECT]
myeloid_count.shape
myeloid_info = filtered_info.loc[MYELOIDINDICES,:]
myeloid_confounders = confounders.loc[MYELOIDINDICES,:]
myeloid_normlog.shape
(5486, 12786)
pca_myeloid = PCA(n_components=500, svd_solver='randomized')
myeloid_pcaproj = pd.DataFrame(pca_myeloid.fit_transform(myeloid_normlog),
index=myeloid_normlog.index)
myeloid_tsneproj_saved = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/lineage_analysis_061019/myeloid_analysis/myeloid_data/myeloid_tsneproj_50PCs_p100.csv',
index_col=0)
myeloid_cluster_saved = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/lineage_analysis_061019/myeloid_analysis/myeloid_data/myeloid_cluster_50PCs_k30.csv',
index_col = 0, header = None)
# generally lower library size for myeloids
sns.distplot(np.log10(myeloid_info['libsize']))
plt.axvline(np.log10(2500), c = 'r')
/opt/miniconda3/envs/scri_env/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<matplotlib.lines.Line2D at 0x7fdf88f49dc0>
myeloid_confounders.columns
Index(['SampleID', 'name', 'background', 'condition', 'replicate',
'experiment', 'ncells', 'libsize', 'mito_fraction', 'ribo_fraction',
'mhc_fraction', 'actin_fraction', 'cytoskeleton_fraction',
'malat1_fraction', 'scrublet_predict', 'scrublet_score', 'lowlibsize'],
dtype='object')
fig = plt.figure(figsize = (8*3, 6))
ax = fig.add_subplot(1, 3, 1)
ax.scatter(myeloid_tsneproj_saved['x'], myeloid_tsneproj_saved['y'], c = np.log2(myeloid_info['libsize']), s = 1)
ax.axis('off')
ax.set_title('Colored by log2 library size')
ax = fig.add_subplot(1, 3, 2)
ax.scatter(myeloid_tsneproj_saved['x'][~myeloid_confounders['scrublet_predict']],
myeloid_tsneproj_saved['y'][~myeloid_confounders['scrublet_predict']], s = 1, label = 'F')
ax.scatter(myeloid_tsneproj_saved['x'][myeloid_confounders['scrublet_predict']],
myeloid_tsneproj_saved['y'][myeloid_confounders['scrublet_predict']], c = 'r', s = 10, label = 'T')
ax.axis('off')
ax.set_title('Scrublet prediction')
ax.legend()
ax = fig.add_subplot(1, 3, 3)
new_cluster = myeloid_cluster_saved.values.flatten()
for j, item in enumerate(np.unique(new_cluster)):
ax.scatter(myeloid_tsneproj_saved['x'][new_cluster == item],
myeloid_tsneproj_saved['y'][new_cluster == item],
s = 1, c = cols[j], label = item)
ax.legend(markerscale = 5, fontsize = 10, ncol = 1)
ax.axis('off');
# removing low quality, doublet with light libsize filtering
FSELECT = (myeloid_info['libsize'] < 2500) | np.isin(myeloid_cluster_saved.values.flatten(), [3,13,15,18])
fig = plt.figure(figsize = (8, 6))
ax = fig.add_subplot(1, 1, 1)
ax.scatter(myeloid_tsneproj_saved['x'][~FSELECT],
myeloid_tsneproj_saved['y'][~FSELECT], s = 1, label = 'Keep')
ax.scatter(myeloid_tsneproj_saved['x'][FSELECT],
myeloid_tsneproj_saved['y'][FSELECT], c = 'r', s = 1, label = 'Filter')
ax.axis('off')
ax.set_title('Scrublet prediction')
ax.legend(markerscale = 10)
<matplotlib.legend.Legend at 0x7fe242b1a250>
filtered_myeloid_count = myeloid_count.loc[~FSELECT,:]
filtered_myeloid_normlog = myeloid_normlog.loc[~FSELECT,:]
filtered_myeloid_info = myeloid_info.loc[~FSELECT,:]
filtered_myeloid_confounders = myeloid_confounders.loc[~FSELECT,:]
pca_myeloid = PCA(n_components=1000, svd_solver='randomized')
filtered_myeloid_pcaproj = pd.DataFrame(pca_myeloid.fit_transform(filtered_myeloid_normlog),
index=filtered_myeloid_normlog.index)
filtered_myeloid_tsneproj_saved = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/lineage_analysis_061019/myeloid_analysis_v2/myeloid_data/myeloid_clean_tsneproj_50PCs_p100.csv',
index_col = 0)
communities = {}
for k in [20, 25, 30, 35, 40, 45]:
print(k)
communities[str(k)], _, _ = phenograph.cluster(filtered_myeloid_pcaproj.iloc[:,0:50],k=k,
clustering_algo = 'leiden',
resolution_parameter = 1)
#filtered_cluster = pd.Series(communities,index=filtered_pcaproj.index, name='cluster')
20 Finding 20 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2835540771484375 seconds Jaccard graph constructed in 2.6331043243408203 seconds Running Leiden optimization Leiden completed in 0.266467809677124 seconds Sorting communities by size, please wait ... PhenoGraph completed in 5.2155232429504395 seconds 25 Finding 25 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.30427074432373047 seconds Jaccard graph constructed in 2.1657848358154297 seconds Running Leiden optimization Leiden completed in 0.481842041015625 seconds Sorting communities by size, please wait ... PhenoGraph completed in 4.185959815979004 seconds 30 Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3395352363586426 seconds Jaccard graph constructed in 2.8546090126037598 seconds Running Leiden optimization Leiden completed in 0.3244011402130127 seconds Sorting communities by size, please wait ... PhenoGraph completed in 5.265318155288696 seconds 35 Finding 35 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2863459587097168 seconds Jaccard graph constructed in 3.273237943649292 seconds Running Leiden optimization Leiden completed in 0.306473970413208 seconds Sorting communities by size, please wait ... PhenoGraph completed in 5.237275838851929 seconds 40 Finding 40 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.37388181686401367 seconds Jaccard graph constructed in 2.6137490272521973 seconds Running Leiden optimization Leiden completed in 0.5405428409576416 seconds Sorting communities by size, please wait ... PhenoGraph completed in 5.083359956741333 seconds 45 Finding 45 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3252220153808594 seconds Jaccard graph constructed in 2.8850088119506836 seconds Running Leiden optimization Leiden completed in 0.3508789539337158 seconds Sorting communities by size, please wait ... PhenoGraph completed in 5.1738879680633545 seconds
res_mat = np.zeros(shape = (6, 6))
for j, item in enumerate(communities.keys()):
for k, item2 in enumerate(communities.keys()):
res_mat[j, k] = adjusted_rand_score(communities[item], communities[item2])
plt.figure(figsize= (8, 6))
plt.imshow(res_mat, vmin = 0, vmax = 1, cmap = 'magma')
plt.colorbar()
plt.xticks(range(6), communities.keys());
plt.yticks(range(6), communities.keys());
#plt.grid()#which='minor', color='w', linestyle='-', linewidth=5)
plt.xlabel('k')
plt.ylabel('k')
plt.title('Adjusted Rand Index')
Text(0.5, 1.0, 'Adjusted Rand Index')
filtered_myeloid_cluster_saved = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/lineage_analysis_061019/myeloid_analysis_v2/myeloid_data/myeloid_filtered_leiden_50PCs_k30_res1.csv',
index_col=0, squeeze=True, names=['cluster'])
filtered_myeloid_cluster_saved.values
array([6, 5, 7, ..., 0, 2, 7])
# classical, non-classical, neutrophil-like monocyte
# macrophage, alveolar macrophage
# cDC2, CCR7 cDC2, cDC1, pDC
MYELOIDMARKERS = ['FN1', 'F13A1', 'VCAN', 'LY6C1', 'LY6C2', 'CCR2', # classical monocyte
'ITGAL', 'CX3CR1', 'FCGR4', 'CD300E', 'ACE', # non-classical monocyte
'S100A8', 'S100A9', 'CSF3R', 'IL1B', # neutrophil-like monocyte
'CLEC10A', 'H2-AB1', 'CD74', 'FLT3', # MonoDC
'CSF1R', 'MRC1', 'APOE', 'C1QA', 'C1QB', 'C1QC', 'CTSB', 'CTSD', # macrophage canonical
'MARCO', 'KRT79', 'KRT19', 'CAR4', 'CHIL3', # alveolar macrophage
'CCL17', 'CCR5', 'MAFB', 'CD86', # mouse macrophage 1
'GPNMB', 'SPP1', 'CD68', # mouse macrophage 2 - 'FABP6',
'SAA3', 'SERPINB2', # mouse macrophage 3 - 'ALOX15', 'PRG4',
'ZBTB46', 'CD207',
'IRF4', 'CEBPB', 'LILRB4A', 'ITGAX', # cDC2 - 'CD1C', 'CD1A', 'CD1E', 'FCER1A', 'TNFRSF12',
'FSCN1', 'CCR7', 'LY75', 'CCL22', 'CD40', 'BIRC3', 'NFKB2', 'IL12B',
'MARCKSL1', 'CD274', 'TNFRSF9', 'SEMA7A', 'STAT4', # CCR7 cDC2
'XCR1', 'CLEC9A', 'CADM1', 'BATF3', 'IRF8', # cDC1 - 'TBHD',
'TCF4', 'PLD4', 'BCL11A', 'RUNX2', 'SIGLECH', 'CCR9', 'BST2' # pDC - 'CLEC4C',
]
print(len(MYELOIDMARKERS))
72
fig = plt.figure(figsize = (8*4, 6*22))
for j, item in enumerate(MYELOIDMARKERS):
ax = fig.add_subplot(22, 4, j + 1)
c = filtered_myeloid_normlog[item]
im1 = ax.scatter(filtered_myeloid_tsneproj_saved['x'], filtered_myeloid_tsneproj_saved['y'], s = 3,
c = c, vmin = np.percentile(c, 1), vmax = np.percentile(c, 99))
ax.axis('off')
ax.set_title(item)
fig.colorbar(im1)